Retrieve the updated IMDB datasets countries.list.gz, genres.list.gz and ratings.list.gz from ftp://ftp.fu-berlin.de/pub/misc/movies/database/.
Open countries.list and ratings.list, merge the records that share the same movie name, and write each title together with its IMDB rating score and country of origin to countryRating.txt.
In [1]:
# The two IMDB lists are opened read-only (the default mode); the merged
# result is written to a freshly truncated countryRating.txt.
ratingsFile = open('ratings.list')
countriesFile = open('countries.list')
output = open('countryRating.txt', 'w')
In [2]:
# Advance both files past their preamble so the next readline() returns
# the first data row.
#
# iter(f.readline, "") stops at end-of-file, so a missing marker raises an
# error instead of looping forever on the empty strings that readline()
# returns once the file is exhausted.
for header in iter(countriesFile.readline, ""):
    if header == "COUNTRIES LIST\n":
        break
else:
    raise ValueError("marker 'COUNTRIES LIST' not found in countries.list")
# Skip the single line that follows the marker.
countriesFile.readline()

for header in iter(ratingsFile.readline, ""):
    if header == "MOVIE RATINGS REPORT\n":
        break
else:
    raise ValueError("marker 'MOVIE RATINGS REPORT' not found in ratings.list")
# Skip the two lines that follow the marker.
ratingsFile.readline()
ratingsFile.readline()
Out[2]:
Extract the first movie record from ratings file
In [3]:
# Tokenise the first rating row on whitespace: the score is the third
# token and every token after it belongs to the title.
ratingLine = ratingsFile.readline().rstrip("\n").split()
ratingMovieName = " ".join(ratingLine[3:])
rating = ratingLine[2]
Extract the first movie record from countries file
In [4]:
# Read the first row of the countries table.
countriesLine = countriesFile.readline().rstrip("\n")

# The country is whatever follows the last ")" once the tab padding
# around it has been removed.
countries = countriesLine[countriesLine.rfind(")") + 1:].strip("\t")

# The title is everything up to the last tab, with the tab padding removed.
# The previous code removed spaces instead of tabs, which deleted the
# spaces inside the title and left the tabs in place, so the result could
# never equal the space-joined title parsed from ratings.list.
countriesMovieName = countriesLine[:countriesLine.rfind("\t") + 1]
countriesMovieName = countriesMovieName.replace("\t", "").rstrip(" ")
In [5]:
def _parse_country_row(row):
    """Return (title, country) for one countries.list row, or None to skip it.

    A row is skipped when it contains "{", lacks a ")", or has no tab
    separating the title from the country.
    """
    if "{" in row or ")" not in row or "\t" not in row:
        return None
    # Country: text after the last ")" with the tab padding removed.
    country = row[row.rfind(")") + 1:].strip("\t")
    # Title: text up to the last tab with the tab padding removed.  Tabs
    # (not spaces) must be removed here, otherwise the title can never
    # equal the space-joined title built from ratings.list.
    title = row[:row.rfind("\t") + 1].replace("\t", "").rstrip(" ")
    return title, country


def _next_rating(handle):
    """Return (title, score) for the next rating row whose title has no "{".

    Returns None at end-of-file or at the first blank line.
    """
    for row in iter(handle.readline, ""):
        row = row.rstrip("\n")
        if row == "":
            break
        fields = row.split()
        title = " ".join(fields[3:])
        if "{" in title:
            continue
        return title, fields[2]
    return None


def _next_country(handle):
    """Return the next usable (title, country) row.

    Returns None at end-of-file or at a row that starts with "---".
    """
    for row in iter(handle.readline, ""):
        row = row.rstrip("\n")
        if row[0:3] == "---":
            break
        record = _parse_country_row(row)
        if record is not None:
            return record
    return None


# Seed the merge with the first record of each file, which was read before
# this loop starts.
if "{" in ratingMovieName:
    rating_record = _next_rating(ratingsFile)
else:
    rating_record = (ratingMovieName, rating)

if countriesLine[0:3] == "---":
    country_record = None
else:
    # Re-derive the record from the raw line so the title is parsed with
    # the same rules as every later row.
    country_record = _parse_country_row(countriesLine)
    if country_record is None:
        country_record = _next_country(countriesFile)

# Sort-merge join: both inputs are walked in step, always advancing the
# side whose title compares lower.  This relies on both files being sorted
# consistently with Python string comparison.
while rating_record is not None and country_record is not None:
    rated_title, score = rating_record
    listed_title, origin = country_record
    if rated_title == listed_title:
        output.write(rated_title + "\t" + score + "\t" + origin + "\n")
        # NOTE(review): both sides advance on a match, so only the first
        # country row of a multi-country title is written - confirm that
        # this is intended.
        rating_record = _next_rating(ratingsFile)
        country_record = _next_country(countriesFile)
    elif rated_title < listed_title:
        rating_record = _next_rating(ratingsFile)
    else:
        country_record = _next_country(countriesFile)
Close the files
In [6]:
# Release the three handles used by the merge, in the order they were opened.
for handle in (ratingsFile, countriesFile, output):
    handle.close()
Open countryRating.txt and calculate the average IMDB movie rating for each country.
In [7]:
import csv
from collections import defaultdict, namedtuple
from operator import attrgetter, itemgetter
from itertools import imap
In [8]:
# Record type for one line of the final report: a country and its mean score.
CountryRating = namedtuple('CountryRating', ['countryorigin', 'ratingscore'])
# Column names applied, in order, to the header-less tab-separated input.
fieldnames = ('name', 'score', 'country')
# Pulls the (score, country) pair out of a row dictionary.
score_and_country = itemgetter('score', 'country')
# Accumulator keyed by country; a missing key starts out as an empty list.
ratings = defaultdict(list)
In [9]:
# Labels that are folded into another label before averaging.
# NOTE(review): 'Dominica' and 'Dominican Republic' are two different
# present-day countries; this entry merges their ratings - confirm that it
# is intended (e.g. required by the mapping tool) or remove it.
_COUNTRY_ALIASES = {
    'West Germany': 'Germany',
    'East Germany': 'Germany',
    'North Vietnam': 'Vietnam',
    'Korea': 'South Korea',
    'Palestine': 'Occupied Palestinian Territory',
    'Soviet Union': 'Russia',
    'Dominica': 'Dominican Republic',
    'Yugoslavia': 'Federal Republic of Yugoslavia',
}

# Collect every score under its (relabelled) country.
with open("countryRating.txt", "r") as moviefile:
    # The file is plain tab-separated text with no quoting convention.
    # QUOTE_NONE keeps a double quote in a title as a literal character;
    # with the default dialect a title that starts with '"' opens a quoted
    # field that can swallow tabs and line breaks.
    movies = csv.DictReader(moviefile, fieldnames=fieldnames, delimiter='\t',
                            quoting=csv.QUOTE_NONE)
    for row in movies:
        score, country = score_and_country(row)
        # Unlisted labels are kept as they are.
        country = _COUNTRY_ALIASES.get(country, country)
        ratings[country].append(float(score))
In [10]:
def average(alist):
    """Return sum(alist) / len(alist); an empty list raises ZeroDivisionError."""
    return sum(alist) / len(alist)


# One CountryRating per country, holding the mean of its collected scores.
average_ratings = [CountryRating(name, average(values)) for name, values in ratings.items()]
print("\nCountries with the highest average movie rating")
print("-----------------------------------------------")
# Best-rated country first; the printed rank starts at 1.
sorted_ratings = sorted(average_ratings, key=attrgetter('ratingscore'), reverse=True)
for rank, entry in enumerate(sorted_ratings, 1):
    print('%i. %s \t%g' % (rank, entry.countryorigin, entry.ratingscore))
In [11]:
# Show the saved image file 'world map.png' as this cell's output.
from IPython.display import Image
Image(filename='world map.png')
Out[11]:
In [12]:
# Create a wordle from the titles of all movies in a genre
# 1. start line at "!Next?" (1994) Documentary
# 2. discard the titles with {}
# 3. select movie titles only from 1 genre (eg. Comedy)
# 4. remove "" from the movie titles
genresFile = open('genres.list', 'r')
output = open('comedyNames.txt', 'w')  # or try comedyNames.list

# Advance past the preamble so the next read returns the first data row.
# iter(f.readline, "") stops at end-of-file, so a missing marker raises an
# error instead of looping forever on the empty strings that readline()
# returns once the file is exhausted.
for header in iter(genresFile.readline, ""):
    if header == "8: THE GENRES LIST\n":
        break
else:
    raise ValueError("marker '8: THE GENRES LIST' not found in genres.list")
# Skip the two lines that follow the marker.
genresFile.readline()
genresFile.readline()
Out[12]:
In [13]:
# Write the title of every Comedy entry that is not a series episode to
# comedyNames.txt, one title per line.
#
# The (VG), (TV) and (V) markers are blanked out first so that the last two
# whitespace-separated tokens of a row are the year and the genre.
for line in genresFile:
    genresLine = line.replace("(VG)", " ").replace("(TV)", " ").replace("(V)", " ").split()
    # A row needs at least a year token and a genre token.  Blank rows have
    # no tokens at all and previously raised IndexError.
    if len(genresLine) < 2:
        continue
    genres = genresLine[-1]
    if "Comedy" not in genres:
        continue
    # Episode rows end their title part with "}", which then lands in the
    # year position; those rows are skipped.
    genresYear = genresLine[-2]
    if "}" in genresYear:
        continue
    genresMovieName = " ".join(genresLine[:-2])
    # Drop the quotation marks that enclose some titles.
    if genresMovieName.startswith('"') and genresMovieName.endswith('"'):
        genresMovieName = genresMovieName[1:-1]
    output.write(genresMovieName + "\n")
Closing files
In [14]:
# Release the genre list and the comedy title file.
for handle in (genresFile, output):
    handle.close()
Read comedyNames.txt, filter a selected list of stopwords from the movie names, then output the filtered list into a new text file.
In [15]:
import re
# Words dropped from the movie titles before the word cloud is built.
# Matching is case-sensitive, so each capitalisation is listed separately.
stopwords = {
    'A', 'al', 'Al', 'auf', 'Auf', 'da', 'Da', 'Dans', 'das', 'Das', 'de', 'De',
    'del', 'Del', 'der', 'Der', 'des', 'Des', 'di', 'Die', 'du', 'ein', 'Ein',
    'el', 'El', 'en', 'En', 'es', 'et', 'Et', 'Ich', 'il', 'Il', 'ja', 'la',
    'La', 'las', 'Las', 'le', 'Le', 'les', 'Les', 'lo', 'Lo', 'los', 'Los',
    'mi', 'Mi', 'na', 'ni', 'Por', 'que', 'Que', 'se', 'Se', 'Um', 'un', 'Un',
    'una', 'Una', 'und', 'une', 'Une',
}
# Open the comedy title list for reading (default mode).
comedyNames = open('comedyNames.txt')
In [16]:
# Copy every title from comedyNames to comedyFiltered.txt with the
# stopwords removed and the remaining words joined by single spaces.
# A title that contains no stopword passes through the filter unchanged,
# so it needs no separate branch.
OUT = open('comedyFiltered.txt', 'w')
try:
    for line in comedyNames:
        kept = [word for word in line.split() if word not in stopwords]
        OUT.write(" ".join(kept) + "\n")
finally:
    OUT.close()
    # The input handle was previously left open.
    comedyNames.close()
In [19]:
# Show the saved image file 'Comedy title wordle.png' as this cell's output.
Image(filename='Comedy title wordle.png')
Out[19]:
Create a line plot (the new IBM Many Eyes no longer offers a stack graph visualization) of the number of movies made in each genre (for an individual country or for all countries combined) over time.
In [20]:
genresFile = open('genres.list', 'r')
output = open('genreYear.txt', 'w')

# Advance past the preamble so the next read returns the first data row.
# iter(f.readline, "") stops at end-of-file, so a missing marker raises an
# error instead of looping forever on the empty strings that readline()
# returns once the file is exhausted.
for header in iter(genresFile.readline, ""):
    if header == "8: THE GENRES LIST\n":
        break
else:
    raise ValueError("marker '8: THE GENRES LIST' not found in genres.list")
# Skip the two lines that follow the marker.
genresFile.readline()
genresFile.readline()
Out[20]:
In [21]:
# Write one "genre<TAB>year" row to genreYear.txt for every entry that is
# not a series episode and has a known year.
#
# The (VG), (TV) and (V) markers are blanked out first so that the last two
# whitespace-separated tokens of a row are the year and the genre.
for line in genresFile:
    genresLine = line.replace("(VG)", " ").replace("(TV)", " ").replace("(V)", " ").split()
    # A row needs at least a year token and a genre token.  Blank rows have
    # no tokens at all and previously raised IndexError.
    if len(genresLine) < 2:
        continue
    genres = genresLine[-1]
    # Skip the 1 movie where the genre is _//bbfc.co.uk/releases/import-export-2008-0_
    if "_" in genres:
        continue
    genresYear = genresLine[-2]
    # Episode rows end their title part with "}", which then lands in the
    # year position; those rows are skipped.
    if "}" in genresYear:
        continue
    # Remove the parentheses around the year.
    if genresYear.startswith('(') and genresYear.endswith(')'):
        genresYear = genresYear[1:-1]
    # Skip the movies where the year is ????
    if "?" in genresYear:
        continue
    # Years of same-titled films carry a Roman-numeral suffix such as
    # "2005/II"; keep only the part before the slash.
    genresYear = genresYear.partition("/")[0]
    output.write(genres + "\t" + genresYear + "\n")
Closing files
In [22]:
# Release the genre list and the genre/year output file.
for handle in (genresFile, output):
    handle.close()
Read genreYear.txt, then sort and count the genres and the years. Print the result in the following format:
Year 1880 1887 1888 1889 1890 1891
Action 0 0 0 0 0 1
Note: genreYear.txt ends with a blank line; delete that line before running this code.
In [23]:
from collections import Counter

# Build one "genre year" key per row, with the separating whitespace
# replaced by exactly one space.
lines = []
with open('genreYear.txt') as sortGY:
    for row in sortGY:
        fields = row.split()
        # Rows without both a genre and a year are ignored.  This covers the
        # blank line at the end of the file, which previously raised
        # IndexError unless it was deleted by hand.
        if len(fields) < 2:
            continue
        lines.append(' '.join(fields))

# Sort genres and years
genres = sorted(set(key.split()[0] for key in lines))
years = sorted(set(key.split()[1] for key in lines))
# Count how often each "genre year" key occurs
countGY = Counter(lines)
In [24]:
# Emit the genre-by-year count table, tab-separated, both to sortedYear.txt
# and to standard output.  Every cell, including the last one of a row, is
# followed by a tab.
OUT = open("sortedYear.txt", "w")

# Header row: the label "Year" followed by every year.
header = "Year" + "\t" + "".join(y + '\t' for y in years)
OUT.write(header + '\n')
print(header)

# One row per genre: the genre name followed by its count for each year.
for g in genres:
    row = g + '\t' + "".join(repr(countGY[g + ' ' + y]) + '\t' for y in years)
    OUT.write(row + '\n')
    print(row)
OUT.close()
In [25]:
# Show the saved image file 'Movie Genre lineplot.png' as this cell's output.
Image(filename='Movie Genre lineplot.png')
Out[25]:
According to the IMDB movie genres data, the number of films produced increased after 1900 and peaked in 1913, with 8782 Shorts, 3232 Dramas, 2961 Comedies, 976 Documentaries and 498 Westerns as the 5 most popular movie genres that year. There was a sharp decline in the number of movies produced between 1914 and 1917, likely because the onset of World War I disrupted the movie industry. Of the 33 movie genres, Short, Drama and Comedy remained the 3 most popular throughout most of the 20th century, except from 1990 to 2009, when Documentary and Adult films emerged as genres competing with Dramas and Comedies. The largest number of films was produced in 2013, with 38741 Shorts, 22166 Dramas, 15114 Comedies, 12022 Documentaries and 4976 Thrillers made that year.
In [ ]: